Parse a single JSON record


In [ ]:
def parseRaw(json_map):
    url = json_map['url']
    content = json_map['html']
    return (url,content)
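
As a quick illustration of the expected input, each line of the data files used below is a JSON object with 'url' and 'html' fields; the record here is made up purely to show the shape parseRaw() expects.


In [ ]:
# Hypothetical record -- only the keys 'url' and 'html' matter.
sample = {'url': 'http://example.pixnet.net/blog/post/1', 'html': '<html><body>hello</body></html>'}
print(parseRaw(sample))   # -> ('http://example.pixnet.net/blog/post/1', '<html><body>hello</body></html>')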

Use BeautifulSoup to extract the text from the HTML, then segment it with Jieba


In [ ]:
## getContent: for an input article's HTML, build its token list via jieba.cut()
def getContent(x):
    from bs4 import BeautifulSoup   # imported inside the function so Spark workers can load it
    soup = BeautifulSoup(x, 'html.parser')
    # Strip line breaks and whitespace from the extracted text.
    text = soup.getText().replace('\n','').replace('\r','').replace(' ','').replace('\t','')
    import jieba
    r = list()
    for term in jieba.cut(text):
        # Keep only multi-character, purely Chinese tokens.
        if len(term) > 1 and checkword(term): r.append(term)
    return r

def checkword(x):
    # True if every character is a CJK unified ideograph.
    return all(u'\u4e00' <= c <= u'\u9fff' for c in x)
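
For a quick local sanity check of getContent() (run on the driver, outside Spark), a cell like the one below can be used; the HTML string is made up for illustration, and the exact tokens depend on jieba's dictionary.


In [ ]:
# Hypothetical HTML snippet, only to show what getContent() returns.
sample_html = u'<html><body><p>今天天氣很好,我們一起去旅行。</p></body></html>'
print(getContent(sample_html))   # e.g. [u'今天', u'天氣', ...] -- exact segmentation depends on jieba's dictionary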

Build the (url, content) pair RDDs


In [ ]:
import json
travel_content = sc.textFile("./pixnet.txt").map(json.loads).map(parseRaw)
makeup_content = sc.textFile("./makeup.txt").map(json.loads).map(parseRaw)

Print the article URLs


In [ ]:
print(travel_content.map(lambda x:x[0]).collect())
print(makeup_content.map(lambda x:x[0]).collect())

Build (link, token list) pairs and count the total number of tokens


In [ ]:
travel_token = travel_content.map(lambda x : (x[0], getContent(x[1])))
makeup_token = makeup_content.map(lambda x : (x[0], getContent(x[1])))

def countTokens(tokenRDD):
    return tokenRDD.map(lambda x: len(x[1])).reduce(lambda a, b: a + b)

totalTokens = countTokens(travel_token) + countTokens(makeup_token)
print('There are %s tokens in the full dataset' % totalTokens)

Find the article with the most tokens


In [ ]:
trainRDD = travel_token.union(makeup_token)

def findBiggestArticle(fullRDD):
    return fullRDD.sortBy(lambda x: -len(x[1])).take(1)

biggestArticle = findBiggestArticle(trainRDD)
print('The article with link "%s" has the most tokens (%s)' % (biggestArticle[0][0],
                                                               len(biggestArticle[0][1])))

TF-IDF score computation

TF-IDF is a weighting scheme commonly used in data mining. It measures how important a token is to a single article and to the whole document collection.

  • IDF (inverse document frequency): the fewer articles a token appears in, the more weight each appearance carries.
  • TF (term frequency): the more often a token appears in a single article, the more important it is to that article.

For each article, computing the TF and IDF of every token and multiplying them produces the article's weighted vector. Finally, the cosine similarity between two articles' vectors tells us how similar the two articles are. A small worked example follows the list below.

Cosine Similarity
  • The more tokens two articles share, the higher their similarity.
  • Shared tokens with higher TF-IDF values contribute more to the cosine similarity score.
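
To make the weighting concrete, here is a small hand-checkable sketch (plain Python, not Spark) of the exact formulas implemented in the cells below: TF = token count / article length, IDF = N / number of articles containing the token (no logarithm), and cosine similarity over the shared tokens. The two toy "articles" are made up for illustration.


In [ ]:
import math

# Two toy token lists, only to illustrate the computation.
doc_a = [u'旅行', u'美食', u'旅行']   # TF: 旅行 = 2/3, 美食 = 1/3
doc_b = [u'美食', u'餐廳']            # TF: 美食 = 1/2, 餐廳 = 1/2

# IDF = N / document frequency, with N = 2 documents here.
idf = {u'旅行': 2.0 / 1, u'美食': 2.0 / 2, u'餐廳': 2.0 / 1}

# TF-IDF weighted vectors, stored as dictionaries.
w_a = {t: doc_a.count(t) / float(len(doc_a)) * idf[t] for t in set(doc_a)}
w_b = {t: doc_b.count(t) / float(len(doc_b)) * idf[t] for t in set(doc_b)}

dot = sum(w_a[t] * w_b[t] for t in w_a if t in w_b)      # only the shared token u'美食' contributes
norm_a = math.sqrt(sum(v * v for v in w_a.values()))
norm_b = math.sqrt(sum(v * v for v in w_b.values()))
print(dot / (norm_a * norm_b))                           # ~0.11: one shared, low-weight token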

In [ ]:
def tf(tokens):
    # Term frequency: occurrences of each token divided by the article length.
    d = {}
    for word in tokens:
        if word not in d:
            d[word] = 1
        else:
            d[word] += 1
    for word in d:
        d[word] = float(d[word]) / len(tokens)
    return d

travel_token_TF = travel_token.map(lambda record: tf(record[1]))
example_dict = travel_token_TF.take(1)[0]
example_dict_sorted = sorted(example_dict, key=example_dict.get, reverse=True)

print("Show the 10 tokens with the highest frequency.")
for index in range(0, 10):
    print(example_dict_sorted[index], example_dict[example_dict_sorted[index]])

In [ ]:
def idfs(RDD):
    # Inverse document frequency: N divided by the number of articles containing the token (no logarithm).
    N = RDD.count()
    uniqueTokens = RDD.map(lambda x: list(set(x[1])))
    tokenSumPairTuple = uniqueTokens.flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
    return tokenSumPairTuple.map(lambda x: (x[0], float(N) / x[1]))

idfsTrain = idfs(trainRDD)
idfsTrainWeights = idfsTrain.collectAsMap()
uniqueTokenCount = idfsTrain.count()

print('There are %s unique tokens in the training set.' % uniqueTokenCount)
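
With this definition the IDF weight of a token is simply N divided by the number of articles that contain it, so a weight of 1 means the token appears in every article and, with the 10 articles used here, a weight of 10 means it appears in exactly one. The optional check below prints the smallest and largest weights; the next two cells rely on these two extremes.


In [ ]:
# Optional sanity check: smallest and largest IDF weights in the training set.
print(idfsTrain.map(lambda x: x[1]).min(), idfsTrain.map(lambda x: x[1]).max())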

List 20 tokens that appear in every article (IDF weight = 1)


In [ ]:
IDFTokens = idfsTrain.filter(lambda token: token[1] == 1).take(20)  ##takeOrdered(10, lambda s: -s[1])
for token in IDFTokens:
    print(token[0] + " " + str(token[1]))

List 20 tokens that appear in only one article (IDF weight = 10, since there are 10 articles)


In [ ]:
IDFTokens = idfsTrain.filter(lambda token: token[1] == 10).take(20)
for token in IDFTokens:
    print(token[0] + " " + str(token[1]))

Compute an article's TF-IDF vector and return it as a dictionary


In [ ]:
def tfidf(tokens, idfs):
    # Multiply each token's TF by its IDF weight to get the article's TF-IDF dictionary.
    tfs = tf(tokens)
    for tk in tfs:
        tfs[tk] = tfs[tk] * idfs[tk]
    return tfs

def showTopWord(link):
    # Print the 10 tokens with the highest TF-IDF weight for the article at the given link.
    tokens = trainRDD.filter(lambda x: x[0] == link).collect()[0][1]
    tokens_weights = tfidf(tokens, idfsTrainWeights)
    tokens_weights_sorted = sorted(tokens_weights, key=tokens_weights.get, reverse=True)
    for index in range(0, 10):
        print(tokens_weights_sorted[index], tokens_weights[tokens_weights_sorted[index]])

link = u'http://chahabi77.pixnet.net/blog/post/436715527'
showTopWord(link)

Implement the dot product function, then use cosine similarity to score how similar two articles are.

dotprod

  • Given the tfidf() dictionaries of two articles, multiply the values of every shared key and sum the results.

norm

  • Compute a vector's Euclidean norm, i.e. the square root of its dot product with itself.

cossim

  • Compute the cosine similarity of two articles: their dot product divided by the product of their norms.

In [ ]:
import math

def dotprod(a, b):
    dotsum = 0
    for tk in a:
        if tk in b:
            dotsum += a[tk]*b[tk]
    return dotsum

def norm(a):
    return math.sqrt(dotprod(a,a))

def cossim(a, b):
    return dotprod(a,b)/(norm(a) * norm(b))

In [ ]:
def cosineSimilarity(tokens1, tokens2, idfsDictionary):
    # Turn both token lists into TF-IDF vectors and compare them with cossim().
    w1 = tfidf(tokens1, idfsDictionary)
    w2 = tfidf(tokens2, idfsDictionary)
    return cossim(w1, w2)

Build the Cartesian product of the 10 articles and compute the similarity of every pair of articles


In [ ]:
crossPair = (trainRDD
              .cartesian(trainRDD)
              .cache())
crossPair.count()


Out[ ]:
100

In [ ]:
similarities = (crossPair 
                .map(lambda record: 
                     (record[0][0], record[1][0], cosineSimilarity(record[0][1], record[1][1], idfsTrainWeights)))
                .cache())

In [ ]:
def getSimilar(link):
    return (similarities
            .filter(lambda record: (record[0] == link))
            .map(lambda record: (record[1], record[2]))
            .sortBy(lambda x: -x[1]).collect())

Given an article's link, retrieve its three most similar articles


In [ ]:
similarArticle = getSimilar(u'http://bowpisces.pixnet.net/blog/post/152118460')
# Index 0 is the article itself (similarity 1.0), so show indices 1-3.
for index in range(1, 4):
    print(similarArticle[index])

Explore the keywords in an article


In [ ]:
showTopWord(u'http://bowpisces.pixnet.net/blog/post/162504740')

In [ ]:
similarArticle = getSimilar(u'http://chahabi77.pixnet.net/blog/post/354943724')
for index in range(1, 4):
    print(similarArticle[index])

In [ ]:
showTopWord(u'http://chahabi77.pixnet.net/blog/post/235296791')
